In this task, following the instructions, I manipulated the data I have and created new feature sets, then applied PCA to reduce the dimensionality of these feature sets. The following code shows the required steps.
## Loading required package: anytime
## Loading required package: data.table
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Loading required package: tidyr
## Loading required package: plyr
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## Loading required package: lubridate
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:plyr':
##
## here
## The following objects are masked from 'package:data.table':
##
## hour, isoweek, mday, minute, month, quarter, second, wday,
## week, yday, year
## The following object is masked from 'package:base':
##
## date
## Loading required package: ggplot2
require(anytime)
require(data.table)
require(dplyr)
require(tidyr)
require(plyr)
require(stats)
require(lubridate)
require(ggplot2)
## Locations of the raw match and odds tables (absolute, machine-specific paths).
MatchDataPath <- '/Users/furkancetinkaya/Desktop/ie582/582/df9b1196-e3cf-4cc7-9159-f236fe738215_matches.rds'
OddDataPath <- '/Users/furkancetinkaya/Desktop/ie582/582/df9b1196-e3cf-4cc7-9159-f236fe738215_odd_details.rds'

## Load both tables as data.tables.
MatchData <- as.data.table(readRDS(MatchDataPath))
OddData <- as.data.table(readRDS(OddDataPath))

## Match dates: parse with anytime() and keep only the calendar day.
MatchData[, date := as.Date(anytime(date))]

## Odds: sort on the full parsed timestamp BEFORE truncating it to a calendar
## day. as.Date() drops the time of day, so sorting afterwards cannot order two
## odds recorded on the same day, and the "last odd" logic below depends on
## that order.
## NOTE(review): assumes the raw `date` carries time-of-day information
## (e.g. epoch seconds) -- confirm against the data.
OddData[, date := anytime(date)]
OddData <- OddData[order(date)]
OddData[, date := as.Date(date)]

## Split the "home:away" score string into two (character) columns.
## as.data.table() re-creates a proper data.table after the tidyr call so the
## `:=` assignments below operate on a valid data.table.
MatchData <- as.data.table(
  separate(MatchData, col = score, into = c("homescore", "awayscore"), sep = ":")
)

## Total goals and the over/under 2.5 indicator (1 = over, 0 = otherwise).
MatchData[, TotalGoal := as.numeric(homescore) + as.numeric(awayscore)]
MatchData[, IsOver := 0]
MatchData[TotalGoal > 2, IsOver := 1]

## Match result. The scores are character strings, so they must be compared as
## numbers: as strings "10" > "9" is FALSE and such matches would be mislabelled.
## NOTE(review): rows with a missing score keep the defaults IsOver = 0 and
## result = "away", as before -- confirm that this is intended.
MatchData[, result := {
  home_goals <- as.numeric(homescore)
  away_goals <- as.numeric(awayscore)
  outcome <- rep("away", .N)
  outcome[which(home_goals > away_goals)] <- "home"
  outcome[which(home_goals == away_goals)] <- "tie"
  factor(outcome, levels = c("away", "home", "tie"))
}]

## Over/under odds restricted to the 2.5 total-goals line.
OddData1 <- OddData[betType == "ou" & totalhandicap == 2.5]

## Number each odd within (bookmaker, match, odd type) in chronological order
## and store the group size, so `noofodds == maxnoofodds` selects the last odd.
OddData[, `:=`(noofodds = seq_len(.N), maxnoofodds = .N),
        by = .(bookmaker, matchId, oddtype)]
## Same numbering for the 2.5-line over/under odds.
OddData1[, `:=`(noofodds = seq_len(.N), maxnoofodds = .N),
         by = .(bookmaker, matchId, oddtype)]
The following code illustrates the feature vector creation for “Pinnacle”. The code is the same for the other bookmakers, so I only included this one. Some columns in the feature vectors consist entirely or mostly of NA values, so I omitted these columns.
## Last recorded Pinnacle odds: all bet types, and the 2.5-line over/under odds.
pinnacle_last <- OddData[bookmaker == "Pinnacle" & noofodds == maxnoofodds]
pinnacle_last_ou25 <- OddData1[bookmaker == "Pinnacle" & noofodds == maxnoofodds]

## Wide format: one row per match, one column per oddtype_betType combination.
feature_vector_pinnacle <- dcast(
  pinnacle_last,
  matchId ~ oddtype + betType,
  value.var = "odd"
)
pinnacle_ou25_wide <- dcast(
  pinnacle_last_ou25,
  matchId ~ oddtype + betType,
  value.var = "odd"
)

## Drop the over/under columns built from all handicap lines (mostly NA per the
## author's note); the 2.5-line columns of the same name are merged in instead.
feature_vector_pinnacle[, c("over_ou", "under_ou") := NULL]
feature_vector_pinnacle <- merge(
  feature_vector_pinnacle,
  pinnacle_ou25_wide,
  by = "matchId"
)

## Attach the over/under indicator and the match result.
feature_vector_pinnacle <- merge(
  feature_vector_pinnacle,
  MatchData[, .(matchId, IsOver, result)],
  by = "matchId"
)

## Keep only matches without any missing value.
feature_vector_pinnacle <- feature_vector_pinnacle[
  complete.cases(feature_vector_pinnacle)
]

## Remove the temporary tables.
rm(pinnacle_last, pinnacle_last_ou25, pinnacle_ou25_wide)
Applying PCA on the feature vectors:
## The plotting package is called "plotly" ("plot_ly" is the name of its
## plotting function, not of a package). library() is used instead of require()
## so that a missing package stops the script instead of only raising a warning.
library(plotly)

## PCA on the odds columns of each bookmaker's feature vector. The identifier
## and the two label columns are excluded; cor = TRUE runs the PCA on the
## correlation matrix, i.e. on standardized variables.
pca_pinnacle <- princomp(feature_vector_pinnacle[, -c("matchId", "IsOver", "result")], cor = TRUE)
pca_betsson <- princomp(feature_vector_Betsson[, -c("matchId", "IsOver", "result")], cor = TRUE)
pca_1xbet <- princomp(feature_vector_1xBet[, -c("matchId", "IsOver", "result")], cor = TRUE)
pca_bet365 <- princomp(feature_vector_bet365[, -c("matchId", "IsOver", "result")], cor = TRUE)
pca_888sport <- princomp(feature_vector_888sport[, -c("matchId", "IsOver", "result")], cor = TRUE)
summary(pca_pinnacle, loadings = TRUE) ## 3 components kept: cumulative proportion of variance is about 0.90
## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## Standard deviation 1.7642558 1.4129641 1.0958472 0.68337545 0.40985038
## Proportion of Variance 0.4446569 0.2852096 0.1715544 0.06671457 0.02399676
## Cumulative Proportion 0.4446569 0.7298666 0.9014210 0.96813558 0.99213235
## Comp.6 Comp.7
## Standard deviation 0.191965397 0.134992057
## Proportion of Variance 0.005264388 0.002603265
## Cumulative Proportion 0.997396735 1.000000000
##
## Loadings:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7
## 1_ah 0.625 0.333 0.703
## 2_ah -0.632 -0.314 0.707
## odd1_1x2 -0.101 0.364 -0.704 0.476 -0.129 -0.343
## odd2_1x2 0.448 -0.228 0.411 0.360 -0.241 -0.625
## oddX_1x2 0.522 0.555 0.640
## over_ou -0.493 -0.127 0.265 0.489 0.648 -0.101
## under_ou 0.523 -0.234 -0.310 0.703 -0.266
summary(pca_betsson, loadings = TRUE) ## 3 components kept: cumulative proportion of variance is about 0.98
## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4
## Standard deviation 2.3689168 1.9485679 1.5162902 0.311943912
## Proportion of Variance 0.4676472 0.3164097 0.1915947 0.008109084
## Cumulative Proportion 0.4676472 0.7840570 0.9756516 0.983760726
## Comp.5 Comp.6 Comp.7 Comp.8
## Standard deviation 0.276549420 0.209642196 0.197695320 0.126336749
## Proportion of Variance 0.006373298 0.003662488 0.003256953 0.001330081
## Cumulative Proportion 0.990134025 0.993796512 0.997053466 0.998383547
## Comp.9 Comp.10 Comp.11 Comp.12
## Standard deviation 0.0918991839 0.0786973689 0.0575757334 0.0379965335
## Proportion of Variance 0.0007037883 0.0005161063 0.0002762471 0.0001203114
## Cumulative Proportion 0.9990873352 0.9996034415 0.9998796886 1.0000000000
##
## Loadings:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8 Comp.9
## 1_ha 0.234 0.365 0.279 0.112 0.113 0.156 0.328 0.152
## 12_dc 0.290 -0.328 -0.172 0.193 0.739 0.350 -0.239
## 1X_dc 0.255 0.362 0.241 0.152 -0.158
## 2_ha -0.415 0.184 0.175 0.295 0.361 -0.633
## NO_bts 0.224 -0.565 0.511 -0.375 0.210 0.412
## X2_dc -0.418 0.185 0.143
## YES_bts -0.216 0.570 0.297 -0.330 -0.432 0.433 -0.215
## odd1_1x2 0.253 0.349 0.276 0.153 0.147 0.136
## odd2_1x2 -0.414 0.230 0.232 0.226 0.731
## oddX_1x2 -0.316 0.294 0.199 0.265 0.144 0.330 -0.245 -0.661
## over_ou 0.210 -0.401 0.200 0.619 0.268 -0.445 0.311
## under_ou -0.237 0.393 -0.154 0.385 0.250 -0.658 -0.259 0.219
## Comp.10 Comp.11 Comp.12
## 1_ha 0.375 0.557 0.319
## 12_dc
## 1X_dc -0.636 0.230 -0.468
## 2_ha -0.183 0.312
## NO_bts
## X2_dc 0.484 0.180 -0.697
## YES_bts
## odd1_1x2 0.257 -0.764 -0.112
## odd2_1x2 -0.330
## oddX_1x2 0.274
## over_ou
## under_ou
summary(pca_1xbet, loadings = TRUE) ## 3 components kept: cumulative proportion of variance is about 0.81
## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## Standard deviation 2.2368478 1.6508249 1.4029525 1.1253847 0.84252670
## Proportion of Variance 0.4169573 0.2271019 0.1640230 0.1055409 0.05915427
## Cumulative Proportion 0.4169573 0.6440593 0.8080822 0.9136231 0.97277740
## Comp.6 Comp.7 Comp.8 Comp.9
## Standard deviation 0.39367130 0.270727985 0.203343792 0.179567438
## Proportion of Variance 0.01291476 0.006107803 0.003445725 0.002687039
## Cumulative Proportion 0.98569216 0.991799960 0.995245685 0.997932724
## Comp.10 Comp.11 Comp.12
## Standard deviation 0.123990505 0.0849277936 0.0471268311
## Proportion of Variance 0.001281137 0.0006010608 0.0001850782
## Cumulative Proportion 0.999213861 0.9998149218 1.0000000000
##
## Loadings:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8 Comp.9
## 1_ah 0.145 0.685 0.712
## 12_dc -0.398 -0.187 -0.143 -0.590 -0.215 0.108 0.590
## 1X_dc -0.139 0.411 0.466 -0.117 0.149
## 2_ah -0.706 0.696
## NO_bts 0.415 -0.467 -0.359 0.594 0.217 -0.252
## X2_dc 0.425 -0.167 -0.133 0.219
## YES_bts 0.147 -0.391 0.465 -0.155 0.708 -0.165 0.197
## odd1_1x2 -0.136 0.387 0.494 -0.136 0.205
## odd2_1x2 0.421 -0.176 -0.227 0.472
## oddX_1x2 0.422 0.165 -0.288 -0.237 0.140 -0.204
## over_ou -0.318 -0.391 0.129 -0.410 -0.143 -0.162 -0.693
## under_ou 0.366 0.318 -0.374 -0.101 -0.729 0.119
## Comp.10 Comp.11 Comp.12
## 1_ah
## 12_dc 0.170
## 1X_dc -0.433 -0.598
## 2_ah
## NO_bts
## X2_dc 0.184 0.560 -0.601
## YES_bts 0.103
## odd1_1x2 -0.163 0.602 0.354
## odd2_1x2 -0.651 -0.243 0.146
## oddX_1x2 0.615 -0.268 0.362
## over_ou -0.174
## under_ou -0.246
summary(pca_bet365, loadings = TRUE) ## 3 components kept: cumulative proportion of variance is about 0.97
## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4
## Standard deviation 2.4714389 1.8204550 1.4984258 0.39558336
## Proportion of Variance 0.5090009 0.2761714 0.1871067 0.01304052
## Cumulative Proportion 0.5090009 0.7851722 0.9722789 0.98531940
## Comp.5 Comp.6 Comp.7 Comp.8
## Standard deviation 0.240878211 0.214048089 0.179729628 0.120705765
## Proportion of Variance 0.004835193 0.003818049 0.002691895 0.001214157
## Cumulative Proportion 0.990154595 0.993972644 0.996664539 0.997878696
## Comp.9 Comp.10 Comp.11 Comp.12
## Standard deviation 0.118558498 0.0770787576 0.0628859134 0.0387783533
## Proportion of Variance 0.001171343 0.0004950946 0.0003295532 0.0001253134
## Cumulative Proportion 0.999050039 0.9995451334 0.9998746866 1.0000000000
##
## Loadings:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8 Comp.9
## 1_ha 0.169 0.458 0.231 0.119 0.139 0.214 0.648
## 12_dc 0.325 -0.288 0.632 0.120 0.259 -0.543 0.152
## 1X_dc 0.197 0.449 0.197 0.108 -0.207 -0.483
## 2_ha -0.398 0.221 0.194 0.351 0.101 -0.592
## NO_bts 0.132 0.133 -0.597 0.187 0.633 -0.186 0.353 0.106
## X2_dc -0.401 0.151 0.352 0.200
## YES_bts -0.162 -0.115 0.583 0.610 -0.429 -0.213
## odd1_1x2 0.195 0.441 0.229 0.122 0.142 -0.112
## odd2_1x2 -0.398 0.165 0.185 0.299 0.218
## oddX_1x2 -0.358 0.230 0.300 -0.164 0.195 0.523 -0.456
## over_ou 0.239 -0.344 0.304 0.463 -0.215 -0.316 0.574 -0.152 0.127
## under_ou -0.288 0.318 -0.228 0.339 -0.237 -0.589 -0.370 -0.288 0.141
## Comp.10 Comp.11 Comp.12
## 1_ha 0.334 0.306
## 12_dc
## 1X_dc -0.244 0.357 0.480
## 2_ha 0.503 -0.101
## NO_bts
## X2_dc -0.168 -0.354 0.693
## YES_bts
## odd1_1x2 -0.264 -0.717 -0.277
## odd2_1x2 -0.636 0.351 -0.314
## oddX_1x2 0.252 -0.329
## over_ou
## under_ou
summary(pca_888sport, loadings = TRUE) ## 2 components kept: cumulative proportion of variance is about 0.91
## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4
## Standard deviation 2.4129029 1.8165505 0.77984006 0.3944692
## Proportion of Variance 0.5822101 0.3299856 0.06081505 0.0155606
## Cumulative Proportion 0.5822101 0.9121956 0.97301069 0.9885713
## Comp.5 Comp.6 Comp.7 Comp.8
## Standard deviation 0.235514087 0.164289664 0.108786827 0.0948649228
## Proportion of Variance 0.005546688 0.002699109 0.001183457 0.0008999354
## Cumulative Proportion 0.994117974 0.996817084 0.998000541 0.9989004765
## Comp.9 Comp.10
## Standard deviation 0.0748832348 0.0734011993
## Proportion of Variance 0.0005607499 0.0005387736
## Cumulative Proportion 0.9994612264 1.0000000000
##
## Loadings:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8 Comp.9
## 1_ha 0.177 0.487 0.205 0.172 0.150 0.464 0.199 0.327
## 12_dc 0.334 -0.281 -0.666 0.592
## 1X_dc 0.204 0.472 0.158 -0.607 0.190 0.406
## 2_ha -0.402 0.263 0.332 0.230 -0.366 -0.574 0.150
## X2_dc -0.404 0.222 -0.132 0.479 -0.139 0.473
## odd1_1x2 0.205 0.469 0.191 0.185 0.125 0.205 -0.419 -0.557
## odd2_1x2 -0.400 0.253 0.309 0.431 0.613 -0.305
## oddX_1x2 -0.351 0.256 0.266 -0.244 -0.730 0.144 -0.275
## over_ou 0.264 -0.281 0.711 -0.274 -0.484 0.177
## under_ou -0.311 0.292 -0.367 -0.637 -0.376 0.364
## Comp.10
## 1_ha 0.525
## 12_dc
## 1X_dc -0.369
## 2_ha 0.344
## X2_dc -0.530
## odd1_1x2 -0.352
## odd2_1x2 -0.136
## oddX_1x2 0.203
## over_ou
## under_ou
The summary tables list the standard deviation of each component. The components with the largest standard deviation capture the most variance and are therefore the most important ones. The loadings are the eigenvectors of the components; they give the directions along which the data are rotated. In the first four cases, three components are required to keep more than 80% of the information. However, in the last case, two components are enough. Therefore, I plotted the first four cases in 3D and the last one in 2D. In the 3D plots, 0 denotes under 2.5 and 1 denotes over 2.5.
## Loading required package: plotly
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following objects are masked from 'package:plyr':
##
## arrange, mutate, rename, summarise
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
## No scatter3d mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plot.ly/r/reference/#scatter-mode
## No scatter3d mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plot.ly/r/reference/#scatter-mode
## No scatter3d mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plot.ly/r/reference/#scatter-mode
## No scatter3d mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plot.ly/r/reference/#scatter-mode
The PCA results seem good. The under and over observations are spread among the points, and their shapes are in line with the overall shape.
Then, Euclidean and Manhattan distances are calculated for the feature matrices. After that, MDS is applied to all distance matrices and the corresponding plots are drawn.
In some cases, the shapes obtained with Euclidean distances are sharper than those obtained with Manhattan distances. In general, both alternatives performed well.
## No scatter3d mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plot.ly/r/reference/#scatter-mode
## No scatter3d mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plot.ly/r/reference/#scatter-mode
## No scatter3d mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plot.ly/r/reference/#scatter-mode
## No scatter3d mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plot.ly/r/reference/#scatter-mode
In this task, an image is read in, and then its different channels are plotted. Noise is added to the image, and the noisy image is plotted together with its different channels.
## Loading required package: jpeg
## num [1:128, 1:128, 1:3] 0.0941 0.0667 0.1647 0.1569 0.0863 ...
A grayscale version of the image is created, and then a patch matrix is built from the image. I applied PCA to the patch matrix. Then, the image is reconstructed using the first three components. Finally, the first three components (eigenvectors) are plotted as well.
## Grayscale image: sum of the three colour channels, scaled so the maximum is 1.
image_gs <- noisy_image[, , 1] + noisy_image[, , 2] + noisy_image[, , 3]
image_gs <- image_gs / max(image_gs)
plot(c(0, 200), c(0, 200), type = "n", xlab = "", ylab = "")
rasterImage(image_gs, 0, 0, 200, 200, interpolate = TRUE)
str(image_gs)
## num [1:128, 1:128] 0.192 0.153 0.24 0.25 0.169 ...

## Patch matrix: one row per 3 x 3 window of the grayscale image, one column
## per pixel of the window. The sizes are derived from the image instead of
## being hard-coded; for the 128 x 128 image this gives 126 * 126 = 15876 rows
## and 9 columns.
patch_size <- 3
n_patch_rows <- nrow(image_gs) - patch_size + 1
n_patch_cols <- ncol(image_gs) - patch_size + 1
## Pre-fill with NA so that a row that was never written cannot go unnoticed.
patches <- matrix(NA_real_,
                  nrow = n_patch_rows * n_patch_cols,
                  ncol = patch_size^2)
## Windows are enumerated row by row: the window whose top-left pixel is (i, j)
## is stored in row (i - 1) * n_patch_cols + j, flattened column by column.
counter <- 1
for (i in seq_len(n_patch_rows)) {
  for (j in seq_len(n_patch_cols)) {
    window <- image_gs[i:(i + patch_size - 1), j:(j + patch_size - 1)]
    patches[counter, ] <- as.vector(window)
    counter <- counter + 1
  }
}

## PCA on the patches (covariance-based, princomp's default).
pca_4 <- princomp(patches)
summary(pca_4, loadings = TRUE)
## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4
## Standard deviation 0.4733564 0.1682056 0.13852967 0.10412156
## Proportion of Variance 0.7284955 0.0919880 0.06239303 0.03524778
## Cumulative Proportion 0.7284955 0.8204835 0.88287652 0.91812429
## Comp.5 Comp.6 Comp.7 Comp.8
## Standard deviation 0.09534074 0.07109206 0.06914065 0.06295340
## Proportion of Variance 0.02955340 0.01643209 0.01554237 0.01288513
## Cumulative Proportion 0.94767769 0.96410978 0.97965215 0.99253728
## Comp.9
## Standard deviation 0.04790969
## Proportion of Variance 0.00746272
## Cumulative Proportion 1.00000000
##
## Loadings:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8 Comp.9
## [1,] 0.322 0.373 0.362 0.520 0.206 0.171 0.333 0.344 0.235
## [2,] 0.343 0.486 -0.382 0.362 -0.508 -0.331
## [3,] 0.321 -0.385 0.363 -0.479 0.282 0.170 0.360 -0.305 0.237
## [4,] 0.336 0.460 0.241 -0.424 -0.535 -0.389
## [5,] 0.360 -0.595 -0.510 0.504
## [6,] 0.335 -0.464 0.239 -0.427 0.534 -0.382
## [7,] 0.320 0.384 -0.353 -0.480 0.283 0.174 -0.361 0.308 0.244
## [8,] 0.341 -0.487 -0.376 0.351 0.516 -0.336
## [9,] 0.320 -0.372 -0.372 0.515 0.204 0.189 -0.321 -0.348 0.235
## Min-max rescaling to [0, 1], so the values can be drawn as grey levels by
## rasterImage().
rescale01 <- function(x) {
  (x - min(x)) / (max(x) - min(x))
}

## Score images: the scores of one principal component, one value per patch,
## arranged as a 126 x 126 matrix and rescaled to [0, 1].
image1 <- rescale01(matrix(pca_4$scores[, 1], ncol = 126, nrow = 126))
image2 <- rescale01(matrix(pca_4$scores[, 2], ncol = 126, nrow = 126))
## Third component: the scores must come from column 3 (column 2 was used here
## before, which drew the second component twice).
image3 <- rescale01(matrix(pca_4$scores[, 3], ncol = 126, nrow = 126))

par(mfrow = c(1, 3))
## Images reconstructed by components. matrix() fills column by column, so each
## image is transposed before plotting.
plot(c(0, 200), c(0, 200), type = "n", xlab = "", ylab = "", main = "Reconstructed Image by Comp 1")
rasterImage(t(image1), 0, 0, 200, 200, interpolate = TRUE)
plot(c(0, 200), c(0, 200), type = "n", xlab = "", ylab = "", main = "Reconstructed Image by Comp 2")
rasterImage(t(image2), 0, 0, 200, 200, interpolate = TRUE)
plot(c(0, 200), c(0, 200), type = "n", xlab = "", ylab = "", main = "Reconstructed Image by Comp 3")
rasterImage(t(image3), 0, 0, 200, 200, interpolate = TRUE)

## Eigenvector images: the nine loadings of a component reshaped to 3 x 3 and
## rescaled to [0, 1].
eigen1 <- rescale01(matrix(pca_4$loadings[, 1], nrow = 3, ncol = 3))
eigen2 <- rescale01(matrix(pca_4$loadings[, 2], nrow = 3, ncol = 3))
eigen3 <- rescale01(matrix(pca_4$loadings[, 3], nrow = 3, ncol = 3))

## Plotting the eigenvector matrices.
## NOTE(review): the eigenvectors are drawn transposed, like the score images;
## confirm this matches the orientation in which the patches were flattened.
plot(c(0, 3), c(0, 3), type = "n", xlab = "", ylab = "", main = "Component 1 - Eigenvector")
rasterImage(t(eigen1), 0, 0, 3, 3, interpolate = TRUE)
plot(c(0, 3), c(0, 3), type = "n", xlab = "", ylab = "", main = "Component 2 - Eigenvector")
rasterImage(t(eigen2), 0, 0, 3, 3, interpolate = TRUE)
plot(c(0, 3), c(0, 3), type = "n", xlab = "", ylab = "", main = "Component 3 - Eigenvector")
rasterImage(t(eigen3), 0, 0, 3, 3, interpolate = TRUE)
As can be seen, the first component keeps the most information about the original photo. The second and third components store less information; however, they can be used for edge detection or for other purposes.